Workspace setting and loading dataset
conda install -c districtdatalabs yellowbrick
Collecting package metadata (current_repodata.json): ...working... failed Note: you may need to restart the kernel to use updated packages.
CondaHTTPError: HTTP 000 CONNECTION FAILED for url <https://conda.anaconda.org/districtdatalabs/win-64/current_repodata.json> Elapsed: - An HTTP error occurred when trying to retrieve this URL. HTTP errors are often intermittent, and a simple retry will get you on your way. 'https://conda.anaconda.org/districtdatalabs/win-64'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn import metrics
dataset = pd.read_csv('cluster.csv')
dataset.head()
| Sample ID | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | ... | P11 | P12 | P13 | P14 | P15 | P16 | P17 | P18 | P19 | P20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3.30 | 7.44 | 1.52 | 3.27 | 0.07 | 2.14 | 0.75 | 0.66 | 0.0 | ... | 49.7 | 50.7 | 6.55 | 4.09 | 4.26 | 0.01 | 0.00 | 24.7 | 2.7 | 1.6 |
| 1 | 2 | 3.43 | 7.63 | 1.63 | 3.27 | 0.05 | 2.01 | 0.74 | 0.65 | 0.0 | ... | 47.3 | 47.9 | 8.35 | 5.08 | 5.01 | 0.01 | 0.00 | 23.3 | 2.3 | 1.8 |
| 2 | 3 | 3.41 | 7.32 | 1.52 | 3.18 | 0.07 | 2.09 | 0.80 | 0.70 | 0.0 | ... | 50.5 | 54.4 | 9.27 | 6.85 | 7.14 | 0.19 | 0.06 | 25.0 | 2.5 | -0.9 |
| 3 | 4 | 3.78 | 7.85 | 1.69 | 3.35 | 0.03 | 1.98 | 0.77 | 0.67 | 0.0 | ... | 47.2 | 48.9 | 10.26 | 5.96 | 5.47 | 0.05 | 0.01 | 24.1 | 5.6 | 2.1 |
| 4 | 5 | 3.90 | 7.99 | 1.61 | 3.43 | 0.02 | 2.14 | 0.77 | 0.71 | 0.0 | ... | 54.1 | 54.1 | 8.19 | 5.81 | 4.72 | 0.64 | 0.16 | 26.8 | 2.5 | 2.1 |
5 rows × 21 columns
dataset.describe(include = "all")
| Sample ID | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | ... | P11 | P12 | P13 | P14 | P15 | P16 | P17 | P18 | P19 | P20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.00000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | ... | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.00000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 | 400.000000 |
| mean | 200.500000 | 3.441575 | 7.403225 | 1.595625 | 3.12340 | 0.056550 | 1.970525 | 0.787700 | 0.690450 | 0.020500 | ... | 81.321250 | 70.146250 | 13.666600 | 10.190050 | 8.39965 | 0.528475 | 0.155375 | 39.988250 | 10.069250 | 10.600000 |
| std | 115.614301 | 0.395690 | 0.421662 | 0.140773 | 0.19666 | 0.015188 | 0.196999 | 0.027931 | 0.017652 | 0.092878 | ... | 44.305153 | 36.149349 | 3.934187 | 3.384236 | 3.35507 | 1.017907 | 0.301290 | 16.699209 | 5.942395 | 6.213398 |
| min | 1.000000 | 2.410000 | 6.280000 | 1.180000 | 2.60000 | 0.020000 | 1.220000 | 0.670000 | 0.640000 | 0.000000 | ... | 42.600000 | 42.100000 | 5.700000 | 4.070000 | 3.49000 | 0.000000 | 0.000000 | 20.200000 | 0.100000 | -0.900000 |
| 25% | 100.750000 | 3.167500 | 7.100000 | 1.520000 | 3.00000 | 0.040000 | 1.870000 | 0.770000 | 0.680000 | 0.000000 | ... | 53.675000 | 49.800000 | 10.930000 | 7.607500 | 6.01750 | 0.000000 | 0.000000 | 28.875000 | 3.875000 | 6.400000 |
| 50% | 200.500000 | 3.435000 | 7.420000 | 1.610000 | 3.10000 | 0.060000 | 1.980000 | 0.790000 | 0.690000 | 0.000000 | ... | 59.850000 | 53.200000 | 13.535000 | 9.840000 | 7.34500 | 0.010000 | 0.000000 | 33.500000 | 11.250000 | 10.900000 |
| 75% | 300.250000 | 3.710000 | 7.710000 | 1.690000 | 3.27000 | 0.070000 | 2.090000 | 0.810000 | 0.700000 | 0.000000 | ... | 77.350000 | 60.625000 | 15.922500 | 12.415000 | 9.82500 | 0.270000 | 0.072500 | 41.725000 | 15.325000 | 14.325000 |
| max | 400.000000 | 4.720000 | 8.630000 | 2.210000 | 3.69000 | 0.130000 | 2.690000 | 0.870000 | 0.760000 | 0.600000 | ... | 183.500000 | 165.600000 | 31.000000 | 22.180000 | 21.88000 | 3.730000 | 0.990000 | 76.300000 | 21.800000 | 28.600000 |
8 rows × 21 columns
features = dataset.iloc[:, 0:19]
target = dataset.iloc[:, -1]
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(features) # Fit the data to the visualizer
visualizer.poof() # Draw/show/poof the data
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
this clustering is optimal when 4 clusters are used. We used k-means and k-mediods with k = 4 in following clustering experiments.
kmeans = KMeans(n_clusters=4)
kmeans.fit(features)
cluster_labels = kmeans.fit_predict(features)
kmeans.cluster_centers_
array([[2.75500000e+02, 3.41564706e+00, 7.36594118e+00, 1.59029412e+00,
3.10417647e+00, 5.65294118e-02, 1.96458824e+00, 7.90470588e-01,
6.92411765e-01, 1.73472348e-17, 1.00064118e+02, 6.55264706e+01,
5.39164706e+01, 1.54597059e+01, 1.02800000e+01, 7.26882353e+00,
1.13652941e+00, 3.34411765e-01, 3.57988235e+01],
[1.04833333e+02, 3.29880000e+00, 7.29246667e+00, 1.54973333e+00,
3.10026667e+00, 5.68666667e-02, 2.01000000e+00, 7.78333333e-01,
6.86200000e-01, 5.42000000e-02, 7.02166667e+01, 5.31293333e+01,
5.05346667e+01, 1.24664667e+01, 7.92340000e+00, 6.68140000e+00,
9.39333333e-02, 2.81333333e-02, 2.78873333e+01],
[6.05000000e+01, 3.89000000e+00, 7.86075000e+00, 1.68000000e+00,
3.32125000e+00, 5.27500000e-02, 1.99175000e+00, 7.90250000e-01,
6.98000000e-01, 1.75000000e-03, 1.75762500e+02, 1.67522500e+02,
1.39447500e+02, 1.16520000e+01, 1.39225000e+01, 1.38970000e+01,
5.40000000e-02, 1.32500000e-02, 7.15425000e+01],
[3.80500000e+02, 3.63875000e+00, 7.51950000e+00, 1.70600000e+00,
3.09400000e+00, 5.92500000e-02, 1.82650000e+00, 8.08500000e-01,
6.90500000e-01, 1.04083409e-17, 1.74490000e+02, 1.67967500e+02,
1.43365000e+02, 1.25610000e+01, 1.45752500e+01, 1.41517500e+01,
4.82500000e-02, 1.37500000e-02, 7.16175000e+01]])
Calculate silhouette coefficient for above clustering
silhouette_avg = metrics.silhouette_score(features, cluster_labels)
print ('silhouette coefficient for the above clutering = ', silhouette_avg)
silhouette coefficient for the above clutering = 0.6037875020262065
o.60 would be a fair value for silhouette coeffiencit. -1 is the worst and +1 is the optimal
Install pyclustering on kernel
!pip install pyclustering
from pyclustering.cluster.kmedoids import kmedoids
Requirement already satisfied: pyclustering in c:\users\dell\anaconda3\lib\site-packages (0.10.1.2) Requirement already satisfied: matplotlib>=3.0.0 in c:\users\dell\anaconda3\lib\site-packages (from pyclustering) (3.3.2) Requirement already satisfied: scipy>=1.1.0 in c:\users\dell\anaconda3\lib\site-packages (from pyclustering) (1.5.2) Requirement already satisfied: numpy>=1.15.2 in c:\users\dell\anaconda3\lib\site-packages (from pyclustering) (1.19.2) Requirement already satisfied: Pillow>=5.2.0 in c:\users\dell\anaconda3\lib\site-packages (from pyclustering) (8.0.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.3.0) Requirement already satisfied: python-dateutil>=2.1 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2.8.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2.4.7) Requirement already satisfied: certifi>=2020.06.20 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2020.6.20) Requirement already satisfied: cycler>=0.10 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (0.10.0) Requirement already satisfied: six>=1.5 in c:\users\dell\anaconda3\lib\site-packages (from python-dateutil>=2.1->matplotlib>=3.0.0->pyclustering) (1.15.0)
# Randomly pick 4 indexs from the original sample as the mediods
initial_medoids = [1,100,250,200]
# Create instance of K-Medoids algorithm with prepared centers.
kmedoids_instance = kmedoids(features.values.tolist(), initial_medoids)
# Run cluster analysis.
kmedoids_instance.process()
# predict function is not availble in the release branch yet.
# cluster_labels = kmedoids_instance.predict(features.values)
clusters = kmedoids_instance.get_clusters()
# Prepare cluster labels
cluster_labels = np.zeros([400], dtype=int)
for x in np.nditer(np.asarray(clusters[1])):
cluster_labels[x] = 1
for x in np.nditer(np.asarray(clusters[2])):
cluster_labels[x] = 2
for x in np.nditer(np.asarray(clusters[3])):
cluster_labels[x] = 3
cluster_labels
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3])
# Mediods found in above clustering, indexes are shouwn below.
kmedoids_instance.get_medoids()
[59, 93, 267, 377]
silhouette_avg = metrics.silhouette_score(features, cluster_labels)
print ('silhouette coefficient for the above clutering = ', silhouette_avg)
silhouette coefficient for the above clutering = 0.6027565162885575
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
da = pd.read_csv("cluster.csv")
print(da.head())
Sample ID P1 P2 P3 P4 P5 P6 P7 P8 P9 ... P11 \
0 1 3.30 7.44 1.52 3.27 0.07 2.14 0.75 0.66 0.0 ... 49.7
1 2 3.43 7.63 1.63 3.27 0.05 2.01 0.74 0.65 0.0 ... 47.3
2 3 3.41 7.32 1.52 3.18 0.07 2.09 0.80 0.70 0.0 ... 50.5
3 4 3.78 7.85 1.69 3.35 0.03 1.98 0.77 0.67 0.0 ... 47.2
4 5 3.90 7.99 1.61 3.43 0.02 2.14 0.77 0.71 0.0 ... 54.1
P12 P13 P14 P15 P16 P17 P18 P19 P20
0 50.7 6.55 4.09 4.26 0.01 0.00 24.7 2.7 1.6
1 47.9 8.35 5.08 5.01 0.01 0.00 23.3 2.3 1.8
2 54.4 9.27 6.85 7.14 0.19 0.06 25.0 2.5 -0.9
3 48.9 10.26 5.96 5.47 0.05 0.01 24.1 5.6 2.1
4 54.1 8.19 5.81 4.72 0.64 0.16 26.8 2.5 2.1
[5 rows x 21 columns]
da["a"] = da[["P1"]]
da["b"] = da[["P2"]]
da["d"] = da[["P3"]]
da = da[["a", "b","d"]]
print(da.head())
a b d 0 3.30 7.44 1.52 1 3.43 7.63 1.63 2 3.41 7.32 1.52 3 3.78 7.85 1.69 4 3.90 7.99 1.61
from sklearn.cluster import Birch
model = Birch(branching_factor=30, n_clusters=5, threshold=2.5)
model.fit(da)
pred = model.predict(da)
plt.scatter(da["a"], da["b"], da["d"] ,c=pred, cmap='rainbow', alpha=0.5, edgecolors='b')
plt.show()
C:\Users\DELL\anaconda3\lib\site-packages\sklearn\cluster\_birch.py:646: ConvergenceWarning: Number of subclusters found (1) by Birch is less than (5). Decrease the threshold. warnings.warn(